Author: Rosa Filgueira
In this notebook we are going to perform different experiments, at different levels: events, schedules, performances and tickets. Furthermore, this notebook allows us to select:
These values are introduced in the Parameters Selection section of this notebook.
Prerequisite: You need to have run (at least once) the Generate_List_Dataframes notebook, and have all the dataframes (df_places, df_new_events, df_p, dp_schedule_revenue, df_tickets, df_p_properties, etc.)
As follows we have the schema of events data.
The most important remarks are:
-1. One event can have more than 1 schedule
-2. One schedule can last more than a day: start_ts and end_ts can comprise a period of several days
-3. One schedule can have more than 1 performance
-4. All performances of each schedule are always in the same place (schedule--> place_id).
-5. A performance has a timestamp (ts)
-6. A performance can have several tickets
-7. A ticket can have max_price and min_price
This is the list of experiments that we are going to perform.
Important: In order to run this notebook you need to have run the Generating_List_DataFrames notebook first, since this notebook assumes that all the dataframes (df_places, df_new_events, df_p, dp_schedule_revenue, df_tickets, df_p_properties) are already stored in the "dataframe" directory.
LDA Topic Modelling: This could take a long time to run depending on the city selected.
import datetime
import json
import os
import pickle
import re
from os import listdir
from os.path import isfile, join

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.offline as pyo
from dateutil.relativedelta import relativedelta
from scipy import stats

#pio.renderers.default = "browser"
IMPORTANT: This notebook assumes that you have already generated all the necessary dataframes and stored them in a directory. If not, first run the *Generate_List_Dataframes notebook.
### Change those if you want to use a different path or folder names.
### For the dataframe_path, as you did for Generate_List_Dataframes notebook.
dataframe_path = "./dataframe"

## This path will be used in section 5.
models_path = "./models"
# exist_ok=True creates the directory only if missing, without the
# check-then-create race of the previous os.path.exists() guard.
os.makedirs(models_path, exist_ok=True)

# This path will be used in section 3 - maps
html_path = "./html_figures"
os.makedirs(html_path, exist_ok=True)
def month_to_string(month):
    """Return the English name of month number *month* (1-12).

    Mirrors the original if/elif chain exactly: any value that is not
    1-11 (including 0, 13, negatives) falls through to "December",
    because the final branch was an unconditional ``else``.
    """
    names = {
        1: "January",
        2: "February",
        3: "March",
        4: "April",
        5: "May",
        6: "June",
        7: "July",
        8: "August",
        9: "September",
        10: "October",
        11: "November",
    }
    # .get() keeps the historical "everything else is December" fallback.
    return names.get(month, "December")
def check_city(city):
    """Print whether *city* is available for study.

    Looks the name up in the module-level ``city_l`` list (built from
    df_places.town) and prints either a confirmation or instructions to
    go back and pick a different city.
    """
    if city in city_l:
        print("Ok! You are going to study %s" %city)
    else:
        print("Error! Change this city name: %s. Unfortunatly this one is not in the currently available" % city)
        print("Go to the previous cell and change the name of the city, and run it again")
def check_categories(list_categories):
    """Print an error for every entry of *list_categories* that is not in
    the module-level ``category_l`` list; print a confirmation only when
    every entry is available.
    """
    all_known = True
    for candidate in list_categories:
        if candidate not in category_l:
            print("Error! Change this category name: %s. Unfortunatly this one is not in the currently available" % candidate)
            print("Go to the previous cell and change the name of this category, and run it again")
            all_known = False
    if all_known:
        print("Ok! You are going to study %s" % list_categories)
def dataframe_groupby_size(df, column_list, rename, level=None, city=None, period="full"):
    """Group *df* by *column_list* and count rows per group.

    Parameters
    ----------
    df : pandas.DataFrame to group.
    column_list : list of column names; a single column also produces figures.
    rename : name given to the size/count column.
    level, city : strings used only to compose the figure titles.
    period : anything other than "full" appends a month suffix to the title
        (uses the module-level ``month_string`` — set in the parameters cell).

    Returns
    -------
    (df, scatter_fig, bar_fig) for a single grouping column;
    just the grouped dataframe for multiple columns.
    """
    if len(column_list) == 1:
        column = column_list[0]
        df_v1 = df.groupby([column]).size().reset_index()
        df_v1 = df_v1.rename(columns={0: rename}).sort_values(by=[rename], ascending=False)
        # The two original branches duplicated the figure code and differed
        # only in the optional " at <city>" title suffix — build one title.
        title = level + " " + rename + " per " + column
        if city:
            title = title + " at " + city
        if period != "full":
            title = title + " for the month of " + month_string + " over the years"
        fig_scatter = px.scatter(df_v1, x=column, y=rename, color=rename, size=rename, size_max=50, title=title)
        fig_bar = px.bar(df_v1, x=column, y=rename, color=column, barmode='group', title=title)
        return df_v1, fig_scatter, fig_bar
    else:
        df_v1 = df.groupby(column_list).size().reset_index()
        df_v1 = df_v1.rename(columns={0: rename})
        return df_v1
def dataframe_groupby_sum(df, list_column, column2, level, rename=None, city=None, period="full"):
    """Group *df* by *list_column* and sum *column2* per group.

    With a single grouping column:
      - without *rename*: returns (dataframe, scatter figure, bar figure);
      - with *rename*: returns (dataframe, bar figure), title includes *city*.
    With several grouping columns: returns only the summed dataframe.

    NOTE(review): when period != "full" the title uses the module-level
    ``month_string``; the *rename* branch assumes *city* is provided.
    """
    # Multi-column grouping: no figures, just the renamed, sorted frame.
    if len(list_column) != 1:
        grouped = df.groupby(list_column)[column2].sum().reset_index()
        grouped = grouped.rename(columns={column2: rename}).sort_values(by=[rename], ascending=False).reset_index()
        return grouped

    key = list_column[0]
    grouped = df.groupby([key])[column2].sum().reset_index()
    if rename:
        title = level + " " + rename + " per " + key + " at " + city
        if period != "full":
            title = title + " for the month of " + month_string + " over the years"
        grouped = grouped.rename(columns={column2: rename}).sort_values(by=[rename], ascending=False).reset_index()
        fig_bar = px.bar(grouped, x=key, y=rename, color=key, barmode='group', title=title)
        return grouped, fig_bar

    title = level + " " + column2 + " per " + key
    if period != "full":
        title = title + " for the month of " + month_string + " over the years"
    grouped = grouped.sort_values(by=[column2], ascending=False)
    fig_scatter = px.scatter(grouped, x=key, y=column2, color=column2, size=column2, size_max=50, title=title)
    fig_bar = px.bar(grouped, x=key, y=column2, color=key, barmode='group', title=title)
    return grouped, fig_scatter, fig_bar
def fig_histogram_color(df, x, column, level, city, y=None, period="full"):
    """Monthly histogram of *df* over *x*, colored by *column*.

    When *y* is given, the histogram aggregates *y* instead of counting rows,
    and *y* is mentioned in the title. When period != "full" a month suffix is
    appended (uses the module-level ``month_string``).

    Returns the plotly figure, with month-period x ticks.
    """
    # The two original branches differed only in the title and in passing y;
    # px.histogram's default is y=None, so one call covers both cases.
    if not y:
        title = "Histogram of " + level.lower() + " per " + column + " at " + city
    else:
        title = "Histogram of " + level.lower() + " " + y + " per " + column + " at " + city
    if period != "full":
        title = title + " for the month of " + month_string + " over the years"
    # nbins=12*num_years --> 12*6
    fig = px.histogram(df, x=x, y=y, color=column, title=title, nbins=72)
    fig.update_xaxes(
        dtick="M1",
        ticklabelmode="period")
    return fig
def fig_histogram_subtype(df, x, column, y, category, level, city, period='full'):
    """Monthly histogram of *y* over *x* for the rows of *df* whose
    *column* equals *category* (e.g. one tag or one category).

    When period != 'full', a month suffix is appended to the title using
    the module-level ``month_string``. Returns the plotly figure.
    """
    subset = df[df[column].isin([category])].sort_values(by=[x], ascending=True).reset_index()
    title = "Histogram of " + category + " " + level.lower() + " " + y + " at " + city
    if period != "full":
        title += " for the month of " + month_string + " over the years"
    # nbins=12*num_years --> 12*6
    fig = px.histogram(subset, x=x, y=y, title=title, nbins=72)
    fig.update_xaxes(dtick="M1", ticklabelmode="period")
    return fig
def clean_documents(text):
    """Clean one document for topic modelling: strip e-mail addresses,
    URLs, single quotes, stopwords, and leftover English articles.

    Relies on ``re`` and on gensim's ``remove_stopwords`` being imported
    at module level.
    """
    text1 = re.sub(r'\S*@\S*\s?', '', text, flags=re.MULTILINE)  # remove email
    text1 = re.sub(r'http\S+', '', text1, flags=re.MULTILINE)  # remove web addresses
    text1 = re.sub(r"\'", "", text1)  # remove single quotes
    text2 = remove_stopwords(text1)
    # BUG FIX: the replacement used to be '\2' in a plain string, i.e. the
    # control character '\x02', not a backreference. r'\2' keeps only the
    # article's trailing whitespace, collapsing "foo the bar" -> "foo bar".
    text2 = re.sub(r'\s+(a|an|and|the|The|A|An|And)(\s+)', r'\2', text2)  # removes the whitespace
    return text2
def map_tag_places(df, tag_name):
    """Build an animated map of the performance places for one event tag.

    Filters *df* (performance-level rows with an 'event_tags' column and a
    'loc' dict column) to rows tagged *tag_name*, expands 'loc' into
    latitude/longitude, and returns:
      - df_tag_city: one row per (event, schedule, performance, place) with
        a 'number_of_perf' count and a 'month' ('%m-%Y') animation column;
      - fig1: scatter_mapbox of tagged performance places, animated by month;
      - fig2: scatter_mapbox of per-place monthly frequencies, animated by month.

    NOTE(review): the figure titles use the module-level ``city`` and
    hard-code "from 2017 to 2022" — confirm against the data's real range.
    """
    df2=df.loc[(df['event_tags'] == tag_name)]
    # Drop rows without a location, then expand the 'loc' dict into columns
    # (assumes 'latitude'/'longitude' string keys — see df_places sample).
    df3=df2.dropna(subset=['loc'])
    df3=pd.concat([df3.drop(['loc'], axis=1), df3['loc'].apply(pd.Series)], axis=1)
    df_tag_city= df3.groupby(['event_id', 'event_name', 'category', 'start_ts', 'end_ts', 'ts', 'town', 'place_id', 'name', 'latitude', 'longitude']).size().reset_index()
    df_tag_city=df_tag_city.rename(columns={0: "number_of_perf"}).sort_values(by=['number_of_perf'], ascending=False)
    df_tag_city[['start_ts','end_ts', 'ts']] = df_tag_city[['start_ts','end_ts', 'ts']].apply(pd.to_datetime)
    df_tag_city['latitude'] = df_tag_city['latitude'].astype(float)
    df_tag_city['longitude'] = df_tag_city['longitude'].astype(float)
    # Round coordinates so the same venue groups together in the count below.
    df_tag_city=df_tag_city.round(8)
    df_tag_city['ts'] = pd.to_datetime(df_tag_city['ts'], utc=True)
    df_tag_city=df_tag_city.sort_values(by=['ts'], ascending=True)
    ##df_perf_loc_city["year"]=pd.DatetimeIndex(pd.to_datetime(df_perf_loc_city['ts'], utc=True)).year
    # Month label used as the animation-frame key for both figures.
    df_tag_city["month"]=pd.to_datetime(df_tag_city['ts'], utc=True).dt.strftime('%m-%Y')
    fig1=px.scatter_mapbox(df_tag_city, lat="latitude", lon="longitude", animation_frame="month",
    title= tag_name + " Tagged Performances Places in "+ city +" from 2017 to 2022",
    color_continuous_scale=px.colors.cyclical.IceFire, size_max=20, zoom=10, hover_name="name")
    # Frequency of tagged performances per (place, month).
    df_tag_count=df_tag_city.groupby(['latitude','longitude', 'month']).count().reset_index()
    df_tag_count=df_tag_count.rename(columns={"event_id": "frequency"})
    df_tag_count=df_tag_count.round(8)
    df_tag_count["name"]=None
    df_tag_count["place_id"]=None
    # Re-attach a representative place name/id for each coordinate pair
    # (count() collapsed them); takes the first match found in df_tag_city.
    for index, row in df_tag_count.iterrows():
        lat=row['latitude']
        long=row['longitude']
        place_id=df_tag_city[(df_tag_city.latitude == lat) & (df_tag_city.longitude == long)]['place_id'].head(1).iloc[0]
        place_name=df_tag_city[(df_tag_city.latitude == lat) & (df_tag_city.longitude == long)]['name'].head(1).iloc[0]
        df_tag_count.at[index, 'name']=place_name
        df_tag_count.at[index, 'place_id']=place_id
    # Round-trip through datetime so animation frames sort chronologically,
    # not lexically ('02-2018' would otherwise come before '12-2017').
    df_tag_count['month'] = pd.to_datetime(df_tag_count['month'], utc=True)
    df_tag_count=df_tag_count.sort_values(by=['month'], ascending=True)
    df_tag_count["month"]=pd.to_datetime(df_tag_count['month'], utc=True).dt.strftime('%m-%Y')
    max_freq=df_tag_count["frequency"].max()
    fig2=px.scatter_mapbox(df_tag_count, lat="latitude", lon="longitude", animation_frame="month",
    color="frequency", size="frequency",
    range_color=(0,max_freq),
    title=tag_name + " Tagged Performances Places Frequency in "+ city + " from 2017 to 2022",
    color_continuous_scale=px.colors.cyclical.IceFire, size_max=30, zoom=10, hover_name="name")
    return df_tag_city, fig1, fig2
The original dataset consists of 9 files, which are snapshots of a series of events and places, each covering a period of 6 months:
And we "transformed" them into a series of dataframes using the Generating_List_DataFrames notebook.
Important: An event can be present across more than one snapshot (this means that it is present in more than one file). To deal with this issue, we "merged" together the information of the repeated events. During the merging phase, we got rid of the repeated information, while preserving the parts that are not repeated (e.g. schedules, tags).
Furthermore, we have also created three new columns at event level:
Those are initialized as follows:
# Load the pickled dataframes produced by the Generate_List_Dataframes
# notebook (places, and the merged/de-duplicated events).
df_places = pd.read_pickle(dataframe_path+"/df_places")
df_events = pd.read_pickle(dataframe_path+"/df_new_events")
df_places[0:2]
| address | postal_code | properties | sort_name | town | website | place_id | modified_ts | created_ts | name | loc | country_code | tags | descriptions | phone_numbers | status | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5 York Place | admin@thestand.co.uk | EH1 3EB | {'place.child-restrictions': True, 'place.faci... | Stand | Edinburgh | http://www.thestand.co.uk | 1 | 2021-11-24T12:18:33Z | 2021-11-24T12:18:33Z | The Stand | {'latitude': '55.955806109395006', 'longitude'... | GB | [Bar & pub food, Comedy, Restaurants, Venues] | [{'type': 'description.list.default', 'descrip... | {'info': '0131 558 7272', 'box_office': '0131 ... | live |
| 1 | 10 Orwell Terrace | NaN | EH11 2DY | NaN | St Bride's Centre | Edinburgh | http://stbrides.wordpress.com | 371 | 2019-12-04T13:27:26Z | 2019-12-04T13:27:26Z | St Bride's Centre | {'latitude': '55.94255035', 'longitude': '-3.2... | GB | [Cinemas, Community centre, Public buildings, ... | [{'type': 'description.list.default', 'descrip... | {'info': '0131 346 1405'} | live |
df_events[0:2]
| event_id | modified_ts | created_ts | name | sort_name | status | id | schedules | descriptions | website | ... | category | properties | ranking_level | ranking_in_level | source_file | period_start | period_end | frequency_event | phone_numbers | alternative_names | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 157884 | 2021-01-13T18:46:26Z | 2007-12-06T17:18:12Z | Väsen | Väsen | live | 157884 | [{'start_ts': '2018-04-26T20:00:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | http://www.twoforjoy.co.uk | ... | Music | {'list:website:comments-end-date': '2013-01-31... | 3 | 2 | sample_20171101.json | 2017-11-01 | 2018-05-01 | 1 | NaN | NaN |
| 1 | 194419 | 2022-02-02T01:03:54Z | 2010-01-25T14:51:46Z | Martin Simpson | Martin Simpson | live | 194419 | [{'start_ts': '2018-03-10T19:30:00+00:00', 'en... | [{'type': 'description.list.default', 'descrip... | http://www.martinsimpson.com/ | ... | Music | {'list:website:comments-end-date': '2020-01-28... | 2 | 1 | sample_20171101.json, sample_20180501.json | 2017-11-01 | 2018-11-01 | 2 | NaN | NaN |
2 rows × 21 columns
### These are important for selecting in the next section the configuration parameters
city_l=list(df_places.town.unique())
# Stringified copy for display: the town column can contain NaN, which
# str() renders as "nan" (visible in the printed city list).
city_l2 = [str(town) for town in city_l]
category_l=list(df_events.category.unique())
cities_available=', '.join(city_l2)
categories_available=', '.join(category_l)
IMPORTANT: You need to indicate the following 3 parameters.
These allow you to select the city, the list of categories to study, and the specific month to perform further analyses.
Below you have the list of cities and categories available.
### This list the cities available
cities_available
'Edinburgh, Musselburgh, Jedburgh, Dunfermline, Peebles, St Andrews, Kirkcaldy, Livingston, Glenrothes, Bathgate, Galashiels, Lochgelly, South Queensferry, North Berwick, Innerleithen, Kelso, Linlithgow, Peeblesshire, Dalkeith, Gorebridge, Cupar, Falkland, Melrose, Aberdour, Haddington, Crail, Ceres, Longniddry, Fife, Prestonpans, , Selkirk, Newtongrange, Port Seton, Denholm, Penicuik, Anstruther, Hawick, Berwick-upon-Tweed, Dalgety Bay, Cardenden, Lasswade, Markinch, Aberlady, Newport on Tay, Broxburn, Luthrie, Auchtermuchty, Duns, Coldstream, Tranent, Balerno, Elie, Buckhaven, Leith, Wilkieston, Dunbar, Methil, Kinghorn, Collessie, East Linton, St Boswells, Dirleton, Cockenzie, Pathhead, Burntisland, West Linton, Colinsburgh, Chirnside, Kirkliston, Hutton, Lauder, Gullane, Limekilns, East Calder, West Calder, Ormiston, St Abbs, Humbie, Newburgh, Wormit, Bonchester Bridge, Kelty, Pencaitland, Loanhead, Fauldhouse, Saline, Pitscottie, Lothianburn, Crossford, Leven, Eyemouth, Newport-on-Tay, Ladybank, Ayton, Armadale, Clovenfords, Livingston village, Wallyford, St Monans, Stenton, Newport-On-Tay, Dunshalt, Ednam, Leslie, Freuchie, Howgate, Whittingehame, Kingsbarns, Heiton by Kelso, Cowdenbeath, Easthouses, by St Andrews, St. Andrews, High Valleyfield, Culross, Thornton, Inverkeithing, Pittenweem, Coldingham, Newcastleton, East Lothian, North Queensferry, nr Dunbar, nan, Tayport, Oakley, Kennoway, Blackridge, Kincardine, Glentress, East Neuk, Carlops, Valleyfield, Rosyth, Currie, Ballingry, Leuchars, Gattonside, Eddleston, Strathmiglo, Yetholm, Bonnyrigg, Blebo Craigs, Roslin, Juniper Green, Craigrothie, Cranshaws, Biggar, Berwickshire, Newport, Spott Village, Pumpherston, Balmullo, Gifford, Inveresk, Whitburn, EH8 8BL, Juniper Geren, Dairsie, Newport-on-tay, St. 
Monans, Scottish Borders, Rosewell, Luffness, Nr Edinburgh, West Lothian, Bowden, Coaltown of Balgonie, Edinburgh & Glasgow, Burnmouth, Torpichen, Newtown St Boswells, Gordon, Forgan, Broughton, Earlston, West Barns, Temple, By Cupar, Charlestown, Guardbridge, Dysart, Lade Braes, Kirkaldy, Howden, Smailholm, Glasgow, Dalmeny, Stow, Danderhall, Torphichen, Conifox Adventure Park, Saint Monans, Northampton, Shanks Road, Cockenzie & Port Seton, Tbc, Dumfermline, Winchburgh, Whitsome, By Collessie, East Wemyss, Stobo'
### This list the categories available
categories_available
'Music, Comedy, Theatre, Days out, Other, Clubs, Sport, Festival, Books, Film, Dance, Talks & Lectures, Kids, LGBT, Visual art'
### Introduce here the name of the city that you want to study
city="St Andrews"
### Checking if the city has been selected correctly
check_city(city)
Ok! You are going to study St Andrews
### Introduce here the list of categories
list_categories=["Music", "Visual art", "Film", "Days out", "Books", "Comedy", "Theatre", "Kids", "LGBT", "Festival", "Sport"]
### Checking if the categories have been selected correctly
check_categories(list_categories)
Ok! You are going to study ['Music', 'Visual art', 'Film', 'Days out', 'Books', 'Comedy', 'Theatre', 'Kids', 'LGBT', 'Festival', 'Sport']
### Introduce the number of the month that you want to explore further - e.g month of a festival.
month=3
# Valid months are 1-12. BUG FIX: the original check was "month < 0",
# which silently accepted month=0 (month_to_string would then return
# "December" for it).
if month < 1 or month > 12:
    print("Go the previous cell, and change the number of the month")
You do not need to do anything in the next cell, just run it.
### do not change nothing here
# Month name used in figure titles throughout the notebook.
month_string=month_to_string(month)
# "St Andrews" keeps a fixed short code; any other city is lower-cased
# with spaces removed (used in file names later on).
city_sort_name = "sta" if city == "St Andrews" else city.lower().replace(" ", "")
Important: At event level we don't have information about the places, so we cannot filter events by city, nor by scheduling start/end times. But we do have the information about which snapshot period (period start and period end) each event belongs to, along with the frequency of the repetition of each event across periods.
df_events["tags"]
0 [Folk, Music]
1 [Blues, Jazz, Folk, Music, Folk & world]
2 [Burns, Burns Night, Comedy, Stand-up]
3 [Stand-up, Red Raw, Comedy]
4 [The Saturday Show, Stand-up, Comedy]
...
2272 [Motorsport, Sport]
2276 [Activities, Days out, History, Nature, Storyt...
2278 [Days out, History, Tours, Walking tour, Walks]
2279 [Activities, Days out, Food & Drink]
2280 [Days out, Food & Drink]
Name: tags, Length: 38700, dtype: object
# One row per (event, tag): explode the tags list, then normalize the
# exploded column's name back to 'tags'.
df_tags=df_events.explode('tags')
df_tags.iloc[0]
df_tags=pd.concat([df_tags.drop(['tags'], axis=1), df_tags['tags'].apply(pd.Series)], axis=1)
df_tags.rename(columns={0:'tags'}, inplace=True)
# BUG FIX: dropna() returns a new dataframe; the original discarded the
# result, so events without tags were never actually removed.
df_tags = df_tags.dropna(subset=['tags'])
df_tags.head(3)
| event_id | modified_ts | created_ts | name | sort_name | status | id | schedules | descriptions | website | ... | properties | ranking_level | ranking_in_level | source_file | period_start | period_end | frequency_event | phone_numbers | alternative_names | tags | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 157884 | 2021-01-13T18:46:26Z | 2007-12-06T17:18:12Z | Väsen | Väsen | live | 157884 | [{'start_ts': '2018-04-26T20:00:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | http://www.twoforjoy.co.uk | ... | {'list:website:comments-end-date': '2013-01-31... | 3 | 2 | sample_20171101.json | 2017-11-01 | 2018-05-01 | 1 | NaN | NaN | Folk |
| 0 | 157884 | 2021-01-13T18:46:26Z | 2007-12-06T17:18:12Z | Väsen | Väsen | live | 157884 | [{'start_ts': '2018-04-26T20:00:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | http://www.twoforjoy.co.uk | ... | {'list:website:comments-end-date': '2013-01-31... | 3 | 2 | sample_20171101.json | 2017-11-01 | 2018-05-01 | 1 | NaN | NaN | Music |
| 1 | 194419 | 2022-02-02T01:03:54Z | 2010-01-25T14:51:46Z | Martin Simpson | Martin Simpson | live | 194419 | [{'start_ts': '2018-03-10T19:30:00+00:00', 'en... | [{'type': 'description.list.default', 'descrip... | http://www.martinsimpson.com/ | ... | {'list:website:comments-end-date': '2020-01-28... | 2 | 1 | sample_20171101.json, sample_20180501.json | 2017-11-01 | 2018-11-01 | 2 | NaN | NaN | Blues |
3 rows × 21 columns
We will save the ids of events tagged as golf - to check later in Section 2.5, which of those are in this particular city
len(df_tags[df_tags["tags"]=="Golf"])
14
df_tags[df_tags["tags"]=="Golf"]
| event_id | modified_ts | created_ts | name | sort_name | status | id | schedules | descriptions | website | ... | properties | ranking_level | ranking_in_level | source_file | period_start | period_end | frequency_event | phone_numbers | alternative_names | tags | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 386 | 596787 | 2018-01-05T08:55:29Z | 2016-05-25T10:22:39Z | Tommy's Honour | Tommy's Honour | live | 596787 | [{'start_ts': '2017-11-30T14:30:00+00:00', 'en... | [{'type': 'description.list.default', 'descrip... | NaN | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20171101.json | 2017-11-01 | 2018-05-01 | 1 | NaN | NaN | Golf |
| 4508 | 931952 | 2018-02-06T13:34:04Z | 2018-02-05T11:37:11Z | Valentine's Night Dinner | Valentine's Night Dinner | live | 931952 | [{'start_ts': '2018-02-10T19:00:00+00:00', 'en... | [{'type': 'description.list.default', 'descrip... | NaN | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20171101.json | 2017-11-01 | 2018-05-01 | 1 | NaN | NaN | Golf |
| 2441 | 166804 | 2022-01-10T17:25:01Z | 2008-07-21T12:43:40Z | The Senior Open Presented by Rolex | Senior Open Presented by Rolex | live | 166804 | [{'start_ts': '2018-07-26T00:00:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | http://www.europeantour.com/seniortour/senioro... | ... | {'list:website:comments-end-date': '2013-07-24... | 3 | 2 | sample_20180501.json | 2018-05-01 | 2018-11-01 | 1 | NaN | NaN | Golf |
| 2442 | 1019328 | 2018-05-31T13:21:56Z | 2018-05-31T13:21:22Z | The Senior Open Championship at The Old Course | Senior Open Championship at The Old Course | live | 1019328 | [{'start_ts': '2018-07-26T00:00:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | http://www.senioropenchampionship.com/ | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20180501.json | 2018-05-01 | 2018-11-01 | 1 | NaN | NaN | Golf |
| 2771 | 682151 | 2016-11-29T16:11:10Z | 2016-11-29T14:43:27Z | The Junior Open Championship | Junior Open Championship | live | 682151 | [{'start_ts': '2018-07-16T00:00:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | http://championships.randa.org/en/AmateurChamp... | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20180501.json | 2018-05-01 | 2018-11-01 | 1 | NaN | NaN | Golf |
| 4532 | 145183 | 2018-09-24T14:51:18Z | 2007-03-28T18:32:21Z | World Hickory Open | World Hickory Open | live | 145183 | [{'start_ts': '2018-10-15T00:00:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | http://www.worldhickoryopen.com | ... | {'list:website:comments-end-date': '2013-01-31... | 3 | 2 | sample_20180501.json | 2018-05-01 | 2018-11-01 | 1 | NaN | NaN | Golf |
| 4533 | 985897 | 2019-03-06T12:34:03Z | 2018-04-26T17:23:26Z | Aberdeen Standard Investments Scottish Open | Aberdeen Standard Investments Scottish Open | live | 985897 | [{'start_ts': '2018-07-11T00:00:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | https://www.asiscottishopen.com/ | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20180501.json, sample_20190501.json | 2018-05-01 | 2019-11-01 | 2 | NaN | NaN | Golf |
| 4534 | 986684 | 2019-03-06T15:13:41Z | 2018-04-27T11:10:42Z | Aberdeen Standard Investment Ladies Scottish O... | Aberdeen Standard Investment Ladies Scottish O... | live | 986684 | [{'start_ts': '2018-07-26T00:00:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | https://ladiesscottishopen.com/ | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20180501.json, sample_20190501.json | 2018-05-01 | 2019-11-01 | 2 | NaN | NaN | Golf |
| 5030 | 986685 | 2018-04-27T16:29:19Z | 2018-04-27T11:12:46Z | Staysure Tour Scottish Senior Open 2018 | Staysure Tour Scottish Senior Open 2018 | live | 986685 | [{'start_ts': '2018-09-14T00:00:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | http://www.europeantour.com/staysuretour/ | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20180501.json | 2018-05-01 | 2018-11-01 | 1 | NaN | NaN | Golf |
| 8108 | 1043499 | 2018-07-06T16:40:34Z | 2018-07-06T12:09:16Z | Aberdeen Standard Investments Scottish Open Qu... | Aberdeen Standard Investments Scottish Open Qu... | live | 1043499 | [{'start_ts': '2018-07-07T00:00:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | https://pgagbi.bluegolf.com/bluegolf/pgagbi18/... | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20180501.json | 2018-05-01 | 2018-11-01 | 1 | NaN | NaN | Golf |
| 4705 | 688477 | 2019-01-31T13:15:24Z | 2016-12-16T09:43:36Z | The Captain David Seath Memorial Fund Golf Day | Captain David Seath Memorial Fund Golf Day | live | 688477 | [{'start_ts': '2019-09-06T08:30:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | NaN | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20190501.json | 2019-05-01 | 2019-11-01 | 1 | {'info': '07795 538147'} | NaN | Golf |
| 9608 | 1198157 | 2020-07-29T01:44:32Z | 2020-07-29T01:44:32Z | Guided Walks of the Old Course | Guided Walks of the Old Course | live | 1198157 | [{'start_ts': '2019-05-12T11:00:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | NaN | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 1 | sample_20190501.json | 2019-05-01 | 2019-11-01 | 1 | NaN | NaN | Golf |
| 11029 | 1436517 | 2019-10-22T17:26:17Z | 2019-10-22T15:44:23Z | Scrapheap Golf | Scrapheap Golf | live | 1436517 | [{'start_ts': '2019-10-25T12:00:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | NaN | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 1 | sample_20190501.json, sample_20191101.json, sa... | 2019-05-01 | 2020-11-01 | 3 | NaN | NaN | Golf |
| 926 | 1725762 | 2021-10-09T06:21:43Z | 2021-09-30T23:08:25Z | The Phantom of the Open | Phantom of the Open | live | 1725762 | [{'start_ts': '2021-10-13T20:30:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | NaN | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20210501.json | 2021-05-01 | 2021-11-01 | 1 | NaN | NaN | Golf |
14 rows × 21 columns
### This will be used later in the next section
golf_events_id=df_tags[df_tags["tags"]=="Golf"]["event_id"].to_list()
golf_events_id
[596787, 931952, 166804, 1019328, 682151, 145183, 985897, 986684, 986685, 1043499, 688477, 1198157, 1436517, 1725762]
df_tags.loc[df_tags['tags'] == 'Sport']
| event_id | modified_ts | created_ts | name | sort_name | status | id | schedules | descriptions | website | ... | properties | ranking_level | ranking_in_level | source_file | period_start | period_end | frequency_event | phone_numbers | alternative_names | tags | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 75 | 890174 | 2017-11-21T16:05:35Z | 2017-11-21T11:47:41Z | ARRGmageddon Roller Derby Tournament | ARRGmageddon Roller Derby Tournament | live | 890174 | [{'start_ts': '2017-11-25T11:00:00+00:00', 'en... | [{'type': 'description.list.default', 'descrip... | http://www.arrg.co.uk | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20171101.json | 2017-11-01 | 2018-05-01 | 1 | NaN | NaN | Sport |
| 80 | 856568 | 2017-10-03T03:25:58Z | 2017-10-03T03:25:58Z | Risk v Reward: Taylor v Vazquez | Risk v Reward: Taylor v Vazquez | live | 856568 | [{'start_ts': '2017-11-11T17:00:00+00:00', 'en... | [{'type': 'description.list.default', 'descrip... | NaN | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20171101.json | 2017-11-01 | 2018-05-01 | 1 | NaN | NaN | Sport |
| 566 | 343099 | 2022-01-22T01:54:23Z | 2013-01-29T16:51:42Z | American Wrestling: W3L Wrestling Showdown | American Wrestling: W3L Wrestling Showdown | live | 343099 | [{'start_ts': '2018-02-25T13:30:00+00:00', 'en... | [{'type': 'description.list.default', 'descrip... | http://www.w3lwrestling.com | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20171101.json, sample_20180501.json, sa... | 2017-11-01 | 2022-05-01 | 7 | NaN | NaN | Sport |
| 608 | 849636 | 2021-09-30T11:41:08Z | 2017-09-12T09:36:28Z | Battle of the Sexes | Battle of the Sexes | live | 849636 | [{'start_ts': '2017-11-24T13:30:00+00:00', 'en... | [{'type': 'description.list.default', 'descrip... | NaN | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20171101.json, sample_20180501.json | 2017-11-01 | 2018-11-01 | 2 | NaN | NaN | Sport |
| 837 | 49556 | 2020-10-31T06:22:58Z | 2006-05-21T21:29:26Z | Rocky | Rocky | live | 49556 | [{'start_ts': '2018-03-13T18:30:00+00:00', 'en... | [{'type': 'description.list.default', 'descrip... | NaN | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20171101.json, sample_20200501.json | 2017-11-01 | 2020-11-01 | 2 | NaN | NaN | Sport |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1866 | 1583719 | 2020-07-27T12:00:01Z | 2020-07-20T10:39:37Z | Kids' Summer Tennis Fun | Kids' Summer Tennis Fun | live | 1583719 | [{'start_ts': '2020-08-03T09:00:00+01:00', 'en... | [{'type': 'description.official', 'description... | NaN | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20200501.json | 2020-05-01 | 2020-11-01 | 1 | {'info': '07803598598'} | NaN | Sport |
| 2112 | 1542024 | 2020-03-01T13:14:54Z | 2020-03-01T13:14:54Z | Borders Disability Sport Awards Evening 2020 | Borders Disability Sport Awards Evening 2020 | live | 1542024 | [{'start_ts': '2020-05-22T19:30:00+01:00', 'en... | [{'type': 'description.official', 'description... | NaN | ... | {'list:importance': 'ls'} | 3 | 2 | sample_20200501.json | 2020-05-01 | 2020-11-01 | 1 | NaN | NaN | Sport |
| 2238 | 1569563 | 2020-05-03T02:11:52Z | 2020-05-03T02:11:52Z | #ourecfc Spot The Ball Game (Win Prizes - Supp... | #ourecfc Spot The Ball Game (Win Prizes - Supp... | live | 1569563 | [{'start_ts': '2020-05-02T12:00:00+01:00', 'en... | [{'type': 'description.official', 'description... | NaN | ... | {'list:importance': 'ls'} | 3 | 2 | sample_20200501.json | 2020-05-01 | 2020-11-01 | 1 | NaN | NaN | Sport |
| 2249 | 1572955 | 2020-06-07T01:30:21Z | 2020-05-23T01:33:32Z | Skatetrain 121 All-Terrain Roller Skate Training | Skatetrain 121 All-Terrain Roller Skate Training | live | 1572955 | [{'start_ts': '2020-06-06T11:00:00+01:00', 'en... | [{'type': 'description.official', 'description... | NaN | ... | {'list:importance': 'ls'} | 3 | 2 | sample_20200501.json | 2020-05-01 | 2020-11-01 | 1 | NaN | NaN | Sport |
| 2272 | 1584208 | 2020-07-28T12:14:54Z | 2020-07-22T16:39:58Z | Micro Banger World Cup | Micro Banger World Cup | live | 1584208 | [{'start_ts': '2020-10-10T16:00:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | NaN | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20200501.json | 2020-05-01 | 2020-11-01 | 1 | NaN | NaN | Sport |
663 rows × 21 columns
df_tags.loc[df_tags['tags'] == 'Kids']
| event_id | modified_ts | created_ts | name | sort_name | status | id | schedules | descriptions | website | ... | properties | ranking_level | ranking_in_level | source_file | period_start | period_end | frequency_event | phone_numbers | alternative_names | tags | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 13 | 767734 | 2018-09-16T12:07:12Z | 2018-09-16T12:07:12Z | Comedy Kids | Comedy Kids | live | 767734 | [{'start_ts': '2017-11-25T14:00:00+00:00', 'en... | [{'type': 'description.list.default', 'descrip... | NaN | ... | {'dropin_event': False, 'expected_visit_durati... | 3 | 2 | sample_20171101.json, sample_20180501.json, sa... | 2017-11-01 | 2019-05-01 | 3 | NaN | NaN | Kids |
| 67 | 192377 | 2019-08-27T15:20:34Z | 2009-12-01T15:04:16Z | Scots Music Group Ceilidh with Robert Fish Band | Scots Music Group Ceilidh with Robert Fish Band | live | 192377 | [{'start_ts': '2017-11-25T20:00:00+00:00', 'en... | [{'type': 'description.list.default', 'descrip... | http://www.scotsmusic.org/ceilidhs/ | ... | {'list:website:comments-end-date': '2019-12-03... | 3 | 2 | sample_20171101.json, sample_20180501.json, sa... | 2017-11-01 | 2020-05-01 | 5 | {'info': '0131 555 7668'} | NaN | Kids |
| 72 | 237197 | 2019-10-24T13:19:08Z | 2011-10-04T16:58:52Z | Meadowbank Annual Fireworks Display | Meadowbank Annual Fireworks Display | live | 237197 | [{'start_ts': '2017-11-05T19:30:00+00:00', 'en... | [{'type': 'description.list.default', 'descrip... | http://www.edinburghleisure.co.uk/news/firewor... | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20171101.json | 2017-11-01 | 2018-05-01 | 1 | {'box_office': '0844 8700 887'} | NaN | Kids |
| 107 | 729369 | 2018-01-29T19:20:47Z | 2018-01-29T19:20:47Z | The Chimpanzees of Happy Town | Chimpanzees of Happy Town | live | 729369 | [{'start_ts': '2018-02-17T12:00:00+00:00', 'en... | [{'type': 'description.list.default', 'descrip... | NaN | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20171101.json | 2017-11-01 | 2018-05-01 | 1 | NaN | NaN | Kids |
| 145 | 943017 | 2018-02-22T11:46:28Z | 2018-02-22T11:46:28Z | Science Festival Church Service | Science Festival Church Service | live | 943017 | [{'start_ts': '2018-04-08T11:30:00+01:00', 'en... | [{'type': 'description.official', 'description... | NaN | ... | [] | 3 | 2 | sample_20171101.json | 2017-11-01 | 2018-05-01 | 1 | NaN | NaN | Kids |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2168 | 1521018 | 2020-07-14T16:57:35Z | 2020-02-07T14:00:01Z | Children's Haunted Underground Experience | Children's Haunted Underground Experience | live | 1521018 | [{'start_ts': '2020-08-07T13:30:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | NaN | ... | {'list:importance': 'll'} | 3 | 1 | sample_20200501.json | 2020-05-01 | 2020-11-01 | 1 | NaN | NaN | Kids |
| 2181 | 1559339 | 2020-07-14T16:57:35Z | 2020-03-19T20:00:40Z | Fruit Flies Like a Banana: Kids! | Fruit Flies Like a Banana: Kids! | live | 1559339 | [{'start_ts': '2020-08-05T10:45:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | NaN | ... | {'list:importance': 'll'} | 3 | 1 | sample_20200501.json | 2020-05-01 | 2020-11-01 | 1 | NaN | NaN | Kids |
| 2202 | 1574725 | 2020-06-03T10:57:42Z | 2020-06-01T18:31:04Z | Young Historians Family Walking Tour | Young Historians Family Walking Tour | live | 1574725 | [{'start_ts': '2020-07-01T00:00:00+01:00', 'en... | [{'type': 'description.official', 'description... | NaN | ... | {'list:importance': 'll'} | 3 | 1 | sample_20200501.json | 2020-05-01 | 2020-11-01 | 1 | {'info': '0131 235 2351'} | NaN | Kids |
| 2211 | 1401539 | 2019-09-12T17:04:11Z | 2019-09-11T11:36:36Z | Silverburn Park Music Festival | Silverburn Park Music Festival | live | 1401539 | [{'start_ts': '2020-07-25T12:00:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | NaN | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 1 | sample_20200501.json | 2020-05-01 | 2020-11-01 | 1 | NaN | NaN | Kids |
| 2215 | 1602269 | 2020-10-07T16:34:37Z | 2020-10-05T16:01:06Z | Halloween Pumpkin Carving at The Cauldron | Halloween Pumpkin Carving at The Cauldron | live | 1602269 | [{'start_ts': '2020-10-17T11:00:00+01:00', 'en... | [{'type': 'description.list.default', 'descrip... | NaN | ... | {'dropin_event': False, 'booking_essential': F... | 3 | 2 | sample_20200501.json | 2020-05-01 | 2020-11-01 | 1 | NaN | NaN | Kids |
3552 rows × 21 columns
## All tags attached to event 22066.
df_tags.loc[df_tags['event_id'] == 22066]["tags"]
3524 Fantasy 3524 Ghostbusters 3524 Film 3524 Comedy Name: tags, dtype: object
# Collect the distinct tag values and report how many there are.
list_tags = df_tags['tags'].unique()
n_unique = len(list_tags)
print("Number of unique tags %s" % (n_unique))
Number of unique tags 2175
## Print the first 30 tags
for tag in list_tags[:30]:
    print(tag)
Folk Music Blues Jazz Folk & world Burns Burns Night Comedy Stand-up Red Raw The Saturday Show Sunday Night Laugh-In Stand-Up Sketch Improv Improvisational Kids Musical Politics Play Theatre Storytelling Talks & Lectures Robin Ince Food & Drink Cookery Drama Tribute Act Eric Clapton Rock & Pop
## Count events per (tag, category) pair and rank pairs by frequency.
g_tags=dataframe_groupby_size(df_tags, ['tags','category'], 'frequency', 'Events')
g_tags=g_tags.sort_values(by=['frequency'], ascending=False)
g_tags
| tags | category | frequency | |
|---|---|---|---|
| 2123 | Music | Music | 8590 |
| 887 | Days out | Days out | 4733 |
| 703 | Comedy | Comedy | 4683 |
| 371 | Books | Books | 4467 |
| 3304 | Theatre | Theatre | 4246 |
| ... | ... | ... | ... |
| 1672 | Ice Skating | Dance | 1 |
| 1674 | Ice Skating | Film | 1 |
| 1675 | Ice skating | Days out | 1 |
| 1677 | Illustration | Books | 1 |
| 3747 | workshops | Days out | 1 |
3748 rows × 3 columns
## Categories in which 'Sport'-tagged events appear.
g_tags.loc[g_tags['tags'] == 'Sport']
| tags | category | frequency | |
|---|---|---|---|
| 3023 | Sport | Sport | 401 |
| 3020 | Sport | Days out | 186 |
| 3021 | Sport | Film | 27 |
| 3024 | Sport | Talks & Lectures | 18 |
| 3022 | Sport | Kids | 13 |
| 3016 | Sport | Books | 8 |
| 3019 | Sport | Dance | 5 |
| 3018 | Sport | Comedy | 4 |
| 3017 | Sport | Clubs | 1 |
## Categories in which 'Golf'-tagged events appear.
g_tags.loc[g_tags['tags'] == 'Golf']
| tags | category | frequency | |
|---|---|---|---|
| 1507 | Golf | Days out | 9 |
| 1509 | Golf | Sport | 3 |
| 1508 | Golf | Film | 2 |
## Categories in which 'Festival'-tagged events appear.
g_tags.loc[g_tags['tags'] == 'Festival']
| tags | category | frequency | |
|---|---|---|---|
| 1259 | Festival | Days out | 202 |
| 1255 | Festival | Books | 134 |
| 1260 | Festival | Festival | 88 |
| 1264 | Festival | Music | 59 |
| 1261 | Festival | Film | 31 |
| 1266 | Festival | Talks & Lectures | 25 |
| 1256 | Festival | Clubs | 22 |
| 1258 | Festival | Dance | 17 |
| 1262 | Festival | Kids | 14 |
| 1257 | Festival | Comedy | 13 |
| 1268 | Festival | Visual art | 8 |
| 1267 | Festival | Theatre | 6 |
| 1265 | Festival | Sport | 4 |
| 1263 | Festival | LGBT | 1 |
## Bubble chart: tag frequency per category (bubble size = frequency).
fig_scatter=px.scatter(g_tags, x='tags',y='frequency', color='category', size='frequency', size_max=50, title="Frequency of tags events per category" )
fig_scatter.show()
## Compare how 'Kids' and 'LGBT' tagged events distribute over categories.
kids_or_lgbt = g_tags["tags"].isin(['Kids', 'LGBT'])
g_tags_v1 = g_tags.loc[kids_or_lgbt]
fig_scatter = px.scatter(
    g_tags_v1,
    x='category',
    y='frequency',
    color='tags',
    size='frequency',
    size_max=50,
    title="Frequency of Events tagged as Kids and LGBT per category ",
)
fig_scatter.show()
## Frequency of each tag on its own (no category split).
g_tags_v2, fig_scatter, fig_bar=dataframe_groupby_size(df_tags, ['tags'], 'frequency', 'Events')
fig_scatter.show()
## Frequency of each event category (each event counted once).
g_category, fig_scatter, fig_bar=dataframe_groupby_size(df_events, ['category'], 'frequency', 'Events')
fig_scatter.show()
fig_bar.show()
We are going to calculate the frequency of each category, taking into account how often each event is repeated across periods.
## Category frequency weighted by how often each event repeats across snapshot periods.
g_category_v2, fig_scatter, fig_bar=dataframe_groupby_sum(df_events, ['category'], 'frequency_event', 'Events')
fig_scatter.show()
fig_bar.show()
Remember: An event has two columns with information about its snapshot periods of time.
Therefore, our events can start and end at different periods of time - some examples are here:
## Events per (category, snapshot period in which they start).
g_category_start =dataframe_groupby_size(df_events, ['category', 'period_start'], 'frequency', 'Events')
g_category_start=g_category_start.sort_values(by=['frequency'], ascending=False)
px.scatter(g_category_start, x="category",y='period_start', color='frequency', size="frequency", size_max=60, title="Events frequency per category starting at the same period of time")
## Events per (category, snapshot period in which they end).
g_category_end =dataframe_groupby_size(df_events, ['category', 'period_end'], 'frequency', 'Events')
g_category_end=g_category_end.sort_values(by=['frequency'], ascending=False)
px.scatter(g_category_end, x="category",y='period_end', color='frequency', size="frequency", size_max=60, title="Events frequency per category ending at the same period of time")
## Gantt-style view of event lifespans (period_start..period_end) per category.
fig = px.timeline(df_events, x_start="period_start", x_end="period_end", y="category", color="category")
fig.update_yaxes(autorange="reversed") # otherwise tasks are listed from the bottom up
fig.update_layout(title="Gantt chart of events grouped by categories in different period of time")
fig.update_xaxes(dtick="M2", ticklabelmode="period")
fig.show()
We have pre-calculated the revenue at the schedule level. Originally this information was not available.
1 Event can have 1 to N Schedules.
1 Schedule is in 1 Place
1 Schedule can have 1 to N Performances
1 Performance can have 1 to N Tickets
1 Ticket has a max_price, min_price, currency.
## Pre-computed schedule-level revenue (produced by the Generate_List_Dataframes notebook).
df_schedule_revenue = pd.read_pickle(dataframe_path+"/df_schedule_revenue")
## Filter by city schedules
df_schedule_revenue_city=df_schedule_revenue[df_schedule_revenue['town'].isin([city])]
# NOTE(review): this displays the size of the FULL revenue frame, not the city
# subset created just above — confirm that is intended.
len(df_schedule_revenue)
30765
## Sport schedules in the selected city.
df_schedule_revenue_city[df_schedule_revenue_city["category"]=="Sport"]
| event_id | event_name | category | place_id | start_ts | end_ts | town | place.capacity.max | name | t_revenue_trim_mean | s_revenue | num_performance | num_days | num_perf_per_day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 14846 | 1089256 | Orlando | Sport | 14738 | 2019-04-03 18:30:00+00:00 | 2019-04-03 19:30:00+01:00 | St Andrews | 216.0 | Byre Theatre | 15.0 | 2268.0 | 1 | 1 | 1.0 |
### df_schedule_revenue_city_festival : Just selecting the schedules starting at a particular month across years.
# Keep schedules whose start_ts falls inside `month` of any year 2017-2021.
tz_info = df_schedule_revenue_city.iloc[0].start_ts.tzinfo
frames = []
for year in range(2017, 2022):
    date1 = datetime.datetime(year, month, 1, tzinfo=tz_info)
    # First day of the following month; rolls the year over when month == 12
    # (the original `month + 1` raised ValueError for December).
    date2 = datetime.datetime(year + month // 12, month % 12 + 1, 1, tzinfo=tz_info)
    frames.append(df_schedule_revenue_city[(df_schedule_revenue_city['start_ts'] >= date1)
                                           & (df_schedule_revenue_city['start_ts'] < date2)])
df_schedule_revenue_city_festival = pd.concat(frames, axis=0)
len(df_schedule_revenue_city_festival)
250
## Schedule counts per category for the selected city.
df_s_category, fig_scatter, fig_bar=dataframe_groupby_size(df_schedule_revenue_city, ['category'], 'frequency', 'Schedules', city)
fig_bar.show()
df_s_category
| category | frequency | |
|---|---|---|
| 0 | Books | 357 |
| 4 | Film | 282 |
| 6 | Music | 174 |
| 9 | Talks & Lectures | 107 |
| 10 | Theatre | 105 |
| 3 | Days out | 78 |
| 7 | Other | 74 |
| 5 | Kids | 40 |
| 1 | Comedy | 38 |
| 2 | Dance | 34 |
| 11 | Visual art | 11 |
| 8 | Sport | 1 |
## Same per-category counts, restricted to the festival-month subset.
df_s_category_festival, fig_scatter, fig_bar=dataframe_groupby_size(df_schedule_revenue_city_festival, ['category'], 'frequency', 'Schedules', city, period="Month")
fig_bar.show()
## Monthly histogram of schedules, coloured by category.
fig_his=fig_histogram_color(df_schedule_revenue_city, "start_ts", "category", "Schedules", city)
fig_his.show()
## One schedule-frequency histogram per category of interest.
df_s_city_category= dataframe_groupby_size(df_schedule_revenue_city, ['category', 'start_ts'] , "frequency")
for cat in list_categories:
    fig=fig_histogram_subtype(df_s_city_category, "start_ts", "category", "frequency", cat, "schedules", city)
    fig.show()
## Festival-month schedules histogram, coloured by category.
fig_his=fig_histogram_color(df_schedule_revenue_city_festival, "start_ts", "category", "Schedules", city)
fig_his.show()
## Per-category histograms for the festival-month subset.
df_s_city_category_festival= dataframe_groupby_size(df_schedule_revenue_city_festival, ['category', 'start_ts'] , "frequency", period="Month")
for cat in list_categories:
    fig=fig_histogram_subtype(df_s_city_category_festival, "start_ts", "category", "frequency", cat, "schedules", city, period="Month")
    fig.show()
## Total schedule revenue per category for the city.
df_s_revenue, fig=dataframe_groupby_sum(df_schedule_revenue_city, ['category'], 's_revenue', 'schedules', 'revenue', city)
fig.show()
## Total schedule revenue per category, festival-month subset.
df_s_revenue_festival, fig=dataframe_groupby_sum(df_schedule_revenue_city_festival, ['category'], 's_revenue', 'schedules', 'revenue', city, period="Month")
fig.show()
## Monthly revenue histogram, coloured by category.
fig_his=fig_histogram_color(df_schedule_revenue_city, "start_ts", "category", "schedules", city, "s_revenue")
fig_his.show()
## Monthly total schedule revenue (all categories combined), binned by month.
fig=px.histogram(df_schedule_revenue_city, x="start_ts", y="s_revenue", histfunc="sum", title="Monthly schedules revenue at " + city)
fig.update_traces(xbins_size="M1")
fig.update_xaxes(
    dtick="M1",
    ticklabelmode="period")
fig.show()
## Per-category monthly revenue histograms.
df_s_revenue_category=dataframe_groupby_sum(df_schedule_revenue_city, ['category', 'start_ts'], 's_revenue', 'schedules', 'revenue', city)
for cat in list_categories:
    fig=fig_histogram_subtype(df_s_revenue_category, "start_ts", "category", "revenue", cat, "schedules", city)
    fig.show()
## Festival-month revenue histogram, coloured by category.
fig_his=fig_histogram_color(df_schedule_revenue_city_festival, "start_ts", "category", "schedules", city, "s_revenue", period="Month")
fig_his.show()
## Per-category revenue histograms for the festival-month subset.
df_s_revenue_category_festival=dataframe_groupby_sum(df_schedule_revenue_city_festival, ['category', 'start_ts'], 's_revenue', 'schedules', 'revenue', city, period="Month")
for cat in list_categories:
    fig=fig_histogram_subtype(df_s_revenue_category_festival, "start_ts", "category", "revenue", cat, "schedules", city, period="Month")
    fig.show()
## Load the original (non-revenue) schedules and attach place metadata.
df_schedule_orig = pd.read_pickle(dataframe_path+"/df_s")
df_schedule_orig['start_ts'] = pd.to_datetime(df_schedule_orig['start_ts'])
# on=['place_id','place_id'] duplicated the join key; a single key is equivalent.
df_schedule_orig_place = df_schedule_orig.merge(df_places, on='place_id')
### Filter by city schedules
df_schedule_orig_city = df_schedule_orig_place[df_schedule_orig_place['town'].isin([city])]
### df_schedule_orig_city_festival : Just selecting the schedules starting at a particular month across years.
tz_info = df_schedule_orig_city.iloc[0].start_ts.tzinfo
frames = []
for year in range(2017, 2022):
    date1 = datetime.datetime(year, month, 1, tzinfo=tz_info)
    # First day of the following month; rolls the year over when month == 12
    # (the original `month + 1` raised ValueError for December).
    date2 = datetime.datetime(year + month // 12, month % 12 + 1, 1, tzinfo=tz_info)
    frames.append(df_schedule_orig_city[(df_schedule_orig_city['start_ts'] >= date1)
                                        & (df_schedule_orig_city['start_ts'] < date2)])
df_schedule_orig_city_festival = pd.concat(frames, axis=0)
len(df_schedule_orig_city_festival)
392
## Cross-check which known golf events have schedules in this city, printing their tags.
for event_id in golf_events_id:
    if event_id in df_schedule_orig_city.event_id.values:
        tag=df_schedule_orig_city[df_schedule_orig_city["event_id"]==event_id]["event_tags"].to_list()[0]
        print(event_id, tag)
166804 ['Golf', 'Participation', 'Spectator', 'Sport'] 1019328 ['Days out', 'Golf', 'Sport'] 682151 ['Days out', 'Golf', 'Spectator', 'Sport'] 1198157 ['Days out', 'Golf', 'Sport']
## Explode the per-event tag lists so each (schedule, tag) pair is one row.
df_s_subset=df_schedule_orig_city[["event_id", "event_tags", "category", "start_ts", "place_id", "town"]]
df_s_tags=df_s_subset.explode("event_tags")
# apply(pd.Series) re-materialises each exploded tag as column 0,
# which is then renamed back to 'event_tags'.
df_s_tags=pd.concat([df_s_tags.drop(['event_tags'], axis=1), df_s_tags['event_tags'].apply(pd.Series)], axis=1)
df_s_tags.rename(columns={0:'event_tags'}, inplace=True)
df_s_tags
| event_id | category | start_ts | place_id | town | event_tags | |
|---|---|---|---|---|---|---|
| 3211 | 929587 | Comedy | 2018-05-05 20:00:00+01:00 | 14738 | St Andrews | Stand-up |
| 3211 | 929587 | Comedy | 2018-05-05 20:00:00+01:00 | 14738 | St Andrews | Comedy |
| 3212 | 239937 | Film | 2018-12-13 10:30:00+00:00 | 14738 | St Andrews | Comedy |
| 3212 | 239937 | Film | 2018-12-13 10:30:00+00:00 | 14738 | St Andrews | Drama |
| 3212 | 239937 | Film | 2018-12-13 10:30:00+00:00 | 14738 | St Andrews | Film |
| ... | ... | ... | ... | ... | ... | ... |
| 48570 | 1241533 | Books | 2019-03-16 10:00:00+00:00 | 116708 | St Andrews | Author events |
| 48570 | 1241533 | Books | 2019-03-16 10:00:00+00:00 | 116708 | St Andrews | Book launch |
| 48570 | 1241533 | Books | 2019-03-16 10:00:00+00:00 | 116708 | St Andrews | Book signing |
| 48570 | 1241533 | Books | 2019-03-16 10:00:00+00:00 | 116708 | St Andrews | Books |
| 48576 | 1679880 | Kids | 2021-07-16 00:00:00+01:00 | 50656 | St Andrews | Kids |
4153 rows × 6 columns
## Schedules tagged 'Golf' in the city.
df_s_tags[df_s_tags['event_tags']=='Golf']
| event_id | category | start_ts | place_id | town | event_tags | |
|---|---|---|---|---|---|---|
| 41493 | 166804 | Sport | 2018-07-26 00:00:00+01:00 | 10180 | St Andrews | Golf |
| 41494 | 1019328 | Days out | 2018-07-26 00:00:00+01:00 | 10180 | St Andrews | Golf |
| 41850 | 682151 | Days out | 2018-07-16 00:00:00+01:00 | 10868 | St Andrews | Golf |
| 47275 | 1198157 | Days out | 2019-05-12 11:00:00+01:00 | 78931 | St Andrews | Golf |
## Distinct tags present in the city's schedules.
total_tags=df_s_tags['event_tags'].unique()
print("Total of tags in %s: %s" %(city,len(total_tags)))
Total of tags in St Andrews: 292
## Full list of tags found in the city.
print(total_tags)
['Stand-up' 'Comedy' 'Drama' 'Film' 'Christmas' 'Romance' 'Musical' 'Music' 'Classical' 'Covers Band' 'Folk' 'Tribute Act' 'Country' 'Horror' 'Simon and Garfunkel' 'Tribute' 'Rock & Pop' 'Family' 'Event Cinema' 'Theatre' 'Scottish Festivals' 'Kids' 'Play' 'Puppet Show' 'Plays' '60s' 'Concert' 'Covers bands' 'Sixties' 'Tribute acts' 'Bruce Springsteen' 'Pantomime' 'Opera' 'Action' 'Hercule Poirot' 'Crime' 'Thriller' 'Biography' 'Puppet shows' 'Jazz' 'Big band' 'Swing' 'ELO' '10CC' 'History' 'Heist' 'Fantasy' 'Adventure' 'Physical Theatre' 'Contemporary' 'Improv' 'Winnie-the-Pooh' 'Documentary' 'Arthouse' 'Days out' 'King Kong' 'MonsterVerse' 'Mystery' 'Books' 'Poetry' 'Book readings' 'Stanza' 'Foreigner' 'Rock & pop' 'Journey' 'StAnza' "Scotland's International Poetry Festival" 'Talks & Lectures' 'Dance' 'Visual art' 'Installation' 'Special Interest' 'Blues' 'Gospel' 'Soul' 'Storytelling' 'Aly Bain' 'Ceilidh' 'Phil Cunningham' 'Ballet' 'Family friendly' 'Child friendly' 'Bob Dylan' 'Frankenstein' 'Mary Shelley' 'Nature' 'The Beatles' 'Mamma Mia!' 
'LGBT' 'Rock & Roll' 'Buddy Holly' 'Roy Orbison' 'SCO' 'Scottish Chamber Orchestra' 'Symphonic' 'Macbeth' 'Shakespeare' 'Indie' 'war' 'André Rieu' 'Sci-Fi' 'Animation' 'Disney' 'Activities' 'Festival' nan 'Mental health' 'Literature' 'Fundraiser' 'Sketch' 'Science' 'Variety' 'Queen' 'Celtic' 'Folk & world' 'Circus' 'Big Band' 'Solo guitar' 'Tina Turner' 'Metal' 'VisitScotland' 'Harp' 'Action/Adventure' 'Climate Change' 'Reading' 'Orchestral' 'Food & Drink' 'Wine tasting' 'Current Affairs' 'Exhibition' 'Mixed Media' 'Video art' 'Sound' 'Painting & Drawing' 'Objects' 'Illustration' 'Digital' 'Audio' 'Performance' 'Sport' 'Abba' 'Covers Band/Tribute Act' 'Foreign' 'French' 'Creedence Clearwater Revival' 'Rock' 'Creative Writing' 'Workshop' 'Discussion' 'Migration' 'Walks' 'Translation' 'Poetry Slam' 'Masterclass' 'Drumming' 'Party' 'Archaeology' 'Wine' 'Cabaret' 'Bon Jovi' 'Author events' 'Book launch' 'Book signing' 'Author Events' 'Song' 'Museum' 'Craft' 'Birds' 'Wildlife' 'wind ensemble' 'Traditional & Heritage' 'Candles' 'string quartet' 'Saxophone ensemble' 'Photography' 'Crafts' 'Sculpture' 'Solo piano' 'Ukulele' 'WWII' 'Pixar' 'Star Wars' 'Jumanji' 'Aardman' 'Paddington' 'Aquaman' 'Batman' 'DC Comics' 'DC Extended Universe' 'Superhero' 'Superman' 'Wonder Woman' 'Marvel Comics' 'Marvel Cinematic Universe' 'Thor' "Daddy's Home" 'Bad Moms' 'Maze Runner' 'Frozen' 'A Christmas Carol' 'Choral' 'Fairs' 'Politics' 'choral music' 'Easy listening' 'World' 'Visit Scotland' 'Around Town' 'Easter' 'Trail' 'Architecture & Design' 'Buildings' 'Medieval' 'Museums' 'Visual Art' 'Dramatization' 'Kilo Sale' 'Shopping' 'Fashion' 'Gardens' 'halloween' 'Dementia' 'Gin' 'Charity' 'Markets' 'Electronica' 'Volunteering' 'Craft fairs' 'Textiles' 'Painting & drawing' 'Talks & lectures' 'Smoking' 'Year of Homecoming 2014' 'Open mic' 'Coffee' 'Tastings' 'Book Week Scotland' 'Glasgow' 'wine tasting' 'Wine Tasting' 'Flowers' 'The HandleBards' 'Days Out' 'Festivals' 'Botany' 'Outdoor' 
'Outdoors' 'Snowdrops' 'Exhibitions' 'Prints' 'Talkes & Lectures' 'Health & Wellbeing' 'Scottish produce' 'Food Events' 'Local produce' "Farmers' Markets" 'museum tour' 'Burns Night' 'Philanthropy' "New Year's Day" 'Swimming' 'Astronomy' 'Golf' 'Participation' 'Spectator' 'Choral recital' 'Choral music' 'Track & Field Athletics' 'Highland Games' 'Athletics' 'Scottish Games' 'Gala Days' 'Rugby' 'Community Event' 'Family Friendly' 'Fireworks display' 'Community' "St Andrew's Day" 'Whisky' 'Clubs' 'House' 'R&B' 'Quiz' 'Transport' 'Railways' 'Trains' 'A cappella' 'Burlesque' 'Techno' 'Chart & party' 'Disco' 'Festival ']
## Monthly schedule histograms for a hand-picked set of tags.
list_tags=['LGBT', 'Music','Film', 'Theatre', 'Kids', 'Books', 'Festival', 'Sport', 'Golf']
df_s_city_start_ts= dataframe_groupby_size(df_s_tags, ['event_tags', 'start_ts'] , "frequency")
for tag in list_tags:
    fig=fig_histogram_subtype(df_s_city_start_ts, "start_ts", "event_tags", "frequency", tag, "Schedule (Tagged)", city)
    fig.show()
## Tag-vs-category counts; inspect where 'Golf' schedules fall.
df_s_city_category= dataframe_groupby_size(df_s_tags, ['event_tags', 'category'] , "frequency")
df_s_city_category.loc[df_s_city_category['event_tags'] == 'Golf']
| event_tags | category | frequency | |
|---|---|---|---|
| 209 | Golf | Days out | 3 |
| 210 | Golf | Sport | 1 |
## Compare 'Golf' and 'Sport' tagged schedules per category.
df_s_city_category_v1=df_s_city_category.loc[(df_s_city_category['event_tags'] == 'Golf') | (df_s_city_category["event_tags"]=='Sport')]
fig_scatter=px.scatter(df_s_city_category_v1, x='category',y='frequency', color='event_tags', size='frequency', size_max=50, title="Frequency of Schedules tagged as Golf and Sport per category " )
fig_scatter.show()
## All tags across all schedule categories.
fig_scatter=px.scatter(df_s_city_category, x='category',y='frequency', color='event_tags', size='frequency', size_max=50, title="Frequency of Tags per Schedule Category" )
fig_scatter.show()
## Load performance properties and base performances; performances without
## properties are treated as neither sold-out nor cancelled.
df_p_properties = pd.read_pickle(dataframe_path+"/df_p_properties")
df_p=pd.read_pickle(dataframe_path+"/df_p")
# .copy() makes this an independent frame, fixing the SettingWithCopyWarning
# that the two column assignments below used to trigger.
df_p_no_prop=df_p[df_p.properties.isna()].copy()
df_p_no_prop["performance.sold-out"]=False
df_p_no_prop["performance.cancelled"]=False
df_p_properties_v1=df_p_properties[['event_id', 'event_name', 'category', 'start_ts', 'end_ts', 'ts', "place_id", 'performance.cancelled', "performance.sold-out"]]
df_p_no_prop_v2=df_p_no_prop[['event_id', 'event_name', 'category', 'start_ts', 'end_ts', 'ts', "place_id", "performance.cancelled", "performance.sold-out"]]
/var/folders/4w/57rlv27n3g9b001td63pv1j00000gq/T/ipykernel_61430/2665824026.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /var/folders/4w/57rlv27n3g9b001td63pv1j00000gq/T/ipykernel_61430/2665824026.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
## Combine performances with and without properties into one frame,
## attach place metadata, and keep only rows for the selected city.
frames = [df_p_properties_v1, df_p_no_prop_v2]
df_p_final = pd.concat(frames)
# on=['place_id','place_id'] duplicated the join key; a single key is equivalent.
df_p_final_place = df_p_final.merge(df_places, on='place_id')
df_p_final_town = df_p_final_place.dropna(subset=['town'])
# .copy() so the timestamp conversion below no longer raises SettingWithCopyWarning.
df_p_final_city = df_p_final_town[df_p_final_town['town'].isin([city])].copy()
df_p_final_city['ts'] = pd.to_datetime(df_p_final_city['ts'], utc=True)
/var/folders/4w/57rlv27n3g9b001td63pv1j00000gq/T/ipykernel_61430/712792105.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
#df_p_final_city[df_p_final_city.event_id==1089256]
## NOW, LETS REMOVE ALL THE PERFORMANCES THAT HAVE BEEN CANCELLED
# Keep rows whose 'performance.cancelled' is False or NaN. Rebinding (instead of
# drop(..., inplace=True) on a slice) avoids the SettingWithCopyWarning this used to emit.
df_p_final_city = df_p_final_city.loc[df_p_final_city['performance.cancelled'] != True]
/Users/rf208/opt/anaconda3/envs/py39/lib/python3.9/site-packages/pandas/core/frame.py:4906: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
## LETS RE-GROUP THE INFORMATION, SO WE HAVE JUST ONE ENTRY PER PERFORMANCE
## df_p_properties HAS ORIGINALLY SEVERAL ENTRIES PER PERFORMANCE - AS MANY AS PROPERTIES.
# groupby().size() collapses the property rows; the size becomes 'num_properties'.
df_p_city_clean= df_p_final_city.groupby(['event_id', 'event_name', 'category', 'place_id', 'start_ts', 'end_ts', 'ts']).size().reset_index()
df_p_city_clean=df_p_city_clean.rename(columns={0: "num_properties"})
df_p_city_clean[df_p_city_clean["category"]=="Sport"]
| event_id | event_name | category | place_id | start_ts | end_ts | ts | num_properties | |
|---|---|---|---|---|---|---|---|---|
| 66 | 166804 | The Senior Open Presented by Rolex | Sport | 10180 | 2018-07-26T00:00:00+01:00 | 2018-07-29T00:00:00+01:00 | 2018-07-26 00:00:00+00:00 | 1 |
| 67 | 166804 | The Senior Open Presented by Rolex | Sport | 10180 | 2018-07-26T00:00:00+01:00 | 2018-07-29T00:00:00+01:00 | 2018-07-27 00:00:00+00:00 | 1 |
| 68 | 166804 | The Senior Open Presented by Rolex | Sport | 10180 | 2018-07-26T00:00:00+01:00 | 2018-07-29T00:00:00+01:00 | 2018-07-28 00:00:00+00:00 | 1 |
| 69 | 166804 | The Senior Open Presented by Rolex | Sport | 10180 | 2018-07-26T00:00:00+01:00 | 2018-07-29T00:00:00+01:00 | 2018-07-29 00:00:00+00:00 | 1 |
| 134 | 247734 | St Andrews Highland Games | Sport | 11017 | 2018-07-29T13:00:00+01:00 | 2018-07-29T13:00:00+01:00 | 2018-07-29 12:00:00+00:00 | 1 |
| 135 | 247734 | St Andrews Highland Games | Sport | 11017 | 2019-07-28T12:30:00+01:00 | 2019-07-28T12:30:00+01:00 | 2019-07-28 11:30:00+00:00 | 1 |
| 4374 | 1089256 | Orlando | Sport | 14738 | 2019-04-03T19:30:00+01:00 | 2019-04-03T19:30:00+01:00 | 2019-04-03 18:30:00+00:00 | 1 |
## Performance counts per category for the city.
df_p_category, fig_scatter, fig_bar=dataframe_groupby_size(df_p_city_clean, ['category'], 'frequency', 'Performances', city)
fig_bar.show()
df_p_category
| category | frequency | |
|---|---|---|
| 4 | Days out | 4284 |
| 0 | Books | 1890 |
| 5 | Film | 772 |
| 13 | Visual art | 337 |
| 9 | Other | 285 |
| 8 | Music | 255 |
| 12 | Theatre | 162 |
| 11 | Talks & Lectures | 126 |
| 6 | Kids | 105 |
| 2 | Comedy | 57 |
| 3 | Dance | 45 |
| 7 | LGBT | 13 |
| 10 | Sport | 7 |
| 1 | Clubs | 3 |
## AND LETS PRINT AGAIN THE HISTOGRAM
# Monthly performance counts per category, after cancelled performances were removed.
fig_his=fig_histogram_color(df_p_city_clean, "ts", "category", "Performances", city)
fig_his.show()
###df_p_city_festival_clean : Just selecting the performances starting at a particular month across years.
tz_info = df_p_city_clean.iloc[0].ts.tzinfo
frames = []
for year in range(2017, 2022):
    date1 = datetime.datetime(year, month, 1, tzinfo=tz_info)
    # First day of the following month; rolls the year over when month == 12
    # (the original `month + 1` raised ValueError for December).
    date2 = datetime.datetime(year + month // 12, month % 12 + 1, 1, tzinfo=tz_info)
    frames.append(df_p_city_clean[(df_p_city_clean['ts'] >= date1)
                                  & (df_p_city_clean['ts'] < date2)])
df_p_city_festival_clean = pd.concat(frames, axis=0)
len(df_p_city_festival_clean)
1099
## Festival-month performance histogram, coloured by category.
fig_his=fig_histogram_color(df_p_city_festival_clean, "ts", "category", "Performances", city, period="Month")
fig_his.show()
## Per-category monthly histograms of performances.
df_p_city_category= dataframe_groupby_size(df_p_city_clean, ['category', 'ts'] , "frequency", period="Month")
for cat in list_categories:
    print(cat)
    fig=fig_histogram_subtype(df_p_city_category, "ts", "category", "frequency", cat, "Performances", city, period="Month")
    fig.show()
Music
Visual art
Film
Days out
Books
Comedy
Theatre
Kids
LGBT
Festival
Sport
## Per-category histograms for the festival-month subset of performances.
df_p_city_category_festival= dataframe_groupby_size(df_p_city_festival_clean, ['category', 'ts'] , "frequency", period="Month")
for cat in list_categories:
    print(cat)
    fig=fig_histogram_subtype(df_p_city_category_festival, "ts", "category", "frequency", cat, "Performances", city, period="Month")
    fig.show()
Music
Visual art
Film
Days out
Books
Comedy
Theatre
Kids
LGBT
Festival
Sport
Important: The information of cancellations and sold-out are inside the performances properties. Therefore, here we need to work with the df_p_properties dataframe instead of df_p, because df_p does not have performances properties information.
## Re-load performance properties, attach place info, and filter to the city.
df_p_properties = pd.read_pickle(dataframe_path+"/df_p_properties")
# on=['place_id','place_id'] duplicated the join key; a single key is equivalent.
df_p_properties_place = df_p_properties.merge(df_places, on='place_id')
df_p_properties_town = df_p_properties_place.dropna(subset=['town'])
# .copy() so the timestamp conversion below no longer raises SettingWithCopyWarning.
df_p_properties_city = df_p_properties_town[df_p_properties_town['town'].isin([city])].copy()
df_p_properties_city['ts'] = pd.to_datetime(df_p_properties_city['ts'], utc=True)
len(df_p_properties_city)
/var/folders/4w/57rlv27n3g9b001td63pv1j00000gq/T/ipykernel_61430/2922426291.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
1294
###df_p_properties_festival : Just selecting the performances properties starting at a particular month across years.
tz_info = df_p_properties_city.iloc[0].ts.tzinfo
frames = []
for year in range(2017, 2022):
    date1 = datetime.datetime(year, month, 1, tzinfo=tz_info)
    # First day of the following month; rolls the year over when month == 12
    # (the original `month + 1` raised ValueError for December).
    date2 = datetime.datetime(year + month // 12, month % 12 + 1, 1, tzinfo=tz_info)
    frames.append(df_p_properties_city[(df_p_properties_city['ts'] >= date1)
                                       & (df_p_properties_city['ts'] < date2)])
df_p_properties_city_festival = pd.concat(frames, axis=0)
len(df_p_properties_city_festival)
678
## Sold-out performances over time.
df_p_city_sold_out=df_p_properties_city.dropna(subset=['performance.sold-out'])
#df_p_city_sold_out.iloc[0]
fig_his=fig_histogram_color(df_p_city_sold_out, "ts", "performance.sold-out", "Performances", city)
fig_his.show()
## Sold-out histogram restricted to the festival-month subset.
df_p_city_sold_out_festival = df_p_properties_city_festival.dropna(subset=['performance.sold-out'])
fig_his = fig_histogram_color(df_p_city_sold_out_festival, "ts", "performance.sold-out", "Performances", city, period="Month")
fig_his.show()
## Cancelled performances over time.
df_p_city_cancelled=df_p_properties_city.dropna(subset=['performance.cancelled'])
fig_his=fig_histogram_color(df_p_city_cancelled, "ts", "performance.cancelled", "Performances", city)
fig_his.show()
## Cancelled performances, festival-month subset.
df_p_city_cancelled_festival=df_p_properties_city_festival.dropna(subset=['performance.cancelled'])
fig_his=fig_histogram_color(df_p_city_cancelled_festival, "ts", "performance.cancelled", "Performances", city, period="Month")
fig_his.show()
We are going to plot on maps the performance places over the years. Three types of visualizations:
from IPython.display import IFrame
# Read the Mapbox token from a local 'mapbox_token' file; the 'with' block
# guarantees the file handle is closed (the bare open() leaked it).
with open("mapbox_token") as token_file:
    px.set_mapbox_access_token(token_file.read())
## Build a per-performance location table (lat/long resolved from 'loc')
## used by the animated maps below.
# on=['place_id','place_id'] duplicated the join key; a single key is equivalent.
df_city_place = df_p_city_clean.merge(df_places, on='place_id')
df_p_town = df_city_place.dropna(subset=['town'])
# .copy() so the timestamp conversion below no longer raises SettingWithCopyWarning.
df_p_city = df_p_town[df_p_town['town'].isin([city])].copy()
df_p_city['ts'] = pd.to_datetime(df_p_city['ts'], utc=True)
df_p_city2 = df_p_city.dropna(subset=['loc'])
# Expand each 'loc' dict into its own columns (latitude, longitude, ...).
df_loc_city = pd.concat([df_p_city2.drop(['loc'], axis=1), df_p_city2['loc'].apply(pd.Series)], axis=1)
df_perf_loc_city = df_loc_city.groupby(['event_id', 'event_name', 'category', 'start_ts', 'end_ts', 'ts', 'town', 'place_id', 'name', 'latitude', 'longitude']).size().reset_index()
df_perf_loc_city = df_perf_loc_city.rename(columns={0: "number_of_perf"}).sort_values(by=['number_of_perf'], ascending=False)
df_perf_loc_city[['start_ts','end_ts', 'ts']] = df_perf_loc_city[['start_ts','end_ts', 'ts']].apply(pd.to_datetime)
df_perf_loc_city['latitude'] = df_perf_loc_city['latitude'].astype(float)
df_perf_loc_city['longitude'] = df_perf_loc_city['longitude'].astype(float)
df_perf_loc_city = df_perf_loc_city.round(8)
df_perf_loc_city['ts'] = pd.to_datetime(df_perf_loc_city['ts'], utc=True)
df_perf_loc_city = df_perf_loc_city.sort_values(by=['ts'], ascending=True)
##df_perf_loc_city["year"]=pd.DatetimeIndex(pd.to_datetime(df_perf_loc_city['ts'], utc=True)).year
# 'month' (e.g. '07-2018') drives the animation frames; 'ts' is already UTC datetime here.
df_perf_loc_city["month"] = df_perf_loc_city['ts'].dt.strftime('%m-%Y')
## Animated map of performance places (one frame per month).
fig=px.scatter_mapbox(df_perf_loc_city, lat="latitude", lon="longitude", animation_frame="month",
                      title="Performances Places in "+ city +" from 2017 to 2022",
                      color_continuous_scale=px.colors.cyclical.IceFire, size_max=20, zoom=10, hover_name="name")
## Un-comment this line if you want to show the map
#fig.show()
## Comment these lines if you don't want to save the map
fig_name='performance_places_'+city_sort_name+'.html'
fig.write_html(html_path+'/'+fig_name)
### Trick: just for displaying the map in the final HTML - you won't need to do this.
IFrame(src="https://storage.googleapis.com/case_study_list/"+fig_name, width=1000, height=600)
# Performances aggregated per (latitude, longitude, month); count() turns the
# event_id column into the per-location frequency.
df_perf_loc_city_count = df_perf_loc_city.groupby(['latitude', 'longitude', 'month']).count().reset_index()
df_perf_loc_city_count = df_perf_loc_city_count.rename(columns={"event_id": "frequency"})
df_perf_loc_city_count = df_perf_loc_city_count.round(8)
# Build one (lat, lon) -> (place_id, name) lookup instead of re-filtering the
# whole dataframe for every row (the original loop was O(rows * len(df))).
# drop_duplicates keeps the first match, same as the original .head(1).iloc[0].
place_lookup = (df_perf_loc_city
                .drop_duplicates(subset=['latitude', 'longitude'])
                .set_index(['latitude', 'longitude'])[['place_id', 'name']])
coords = list(zip(df_perf_loc_city_count['latitude'], df_perf_loc_city_count['longitude']))
df_perf_loc_city_count['name'] = [place_lookup.at[c, 'name'] for c in coords]
df_perf_loc_city_count['place_id'] = [place_lookup.at[c, 'place_id'] for c in coords]
# Round-trip through datetime so frames sort chronologically, then back to the label.
df_perf_loc_city_count['month'] = pd.to_datetime(df_perf_loc_city_count['month'], utc=True)
df_perf_loc_city_count = df_perf_loc_city_count.sort_values(by=['month'], ascending=True)
df_perf_loc_city_count["month"] = df_perf_loc_city_count['month'].dt.strftime('%m-%Y')
# Fixed colour range across all animation frames.
max_freq = df_perf_loc_city_count["frequency"].max()
fig = px.scatter_mapbox(df_perf_loc_city_count, lat="latitude", lon="longitude", animation_frame="month",
                        color="frequency", size="frequency",
                        range_color=(0, max_freq),
                        title="Performances Places Frequency in " + city + " from 2017 to 2022",
                        color_continuous_scale=px.colors.cyclical.IceFire, size_max=30, zoom=10, hover_name="name")
## un-comment this line if you want to show the map
#fig.show()
## comment these lines if you don't want to save the map
fig_name = 'freq_performance_places_' + city_sort_name + '.html'
fig.write_html(html_path + '/' + fig_name)
### Trick: just for displaying the map in the final HTML - you won't need to do this.
IFrame(src="https://storage.googleapis.com/case_study_list/" + fig_name, width=1000, height=600)
(Performances just during the selected month, across the years)
# Select the performances falling within the chosen month, for every year
# from 2017 to 2021 (range() excludes 2022). The original duplicated the 2017
# case before the loop; a single 2017-2021 loop is equivalent.
tz_info = df_perf_loc_city.iloc[0].ts.tzinfo
monthly_slices = []
for year in range(2017, 2022):
    date1 = datetime.datetime(year, month, 1, tzinfo=tz_info)
    # relativedelta handles the December -> January rollover; the original
    # datetime.datetime(year, month + 1, 1) raised ValueError when month == 12.
    date2 = date1 + relativedelta(months=1)
    ts = pd.to_datetime(df_perf_loc_city['ts'], utc=True)
    # NOTE: inclusive upper bound (<=) kept from the original, so the first
    # instant of the next month is also included.
    monthly_slices.append(df_perf_loc_city[(ts >= date1) & (ts <= date2)])
city_festival = pd.concat(monthly_slices, axis=0)
city_festival['ts'] = pd.to_datetime(city_festival['ts'], utc=True)
city_festival = city_festival.sort_values(by=['ts'], ascending=True)
# Day label (e.g. "2019-08-15") used as the animation frame.
city_festival["day"] = city_festival['ts'].dt.strftime('%Y-%m-%d')
fig = px.scatter_mapbox(city_festival, lat="latitude", lon="longitude", animation_frame="day",
                        animation_group="name",
                        title="Performances Places in " + city + " During the Month of " + month_string + " (2018 to 2021)",
                        color_continuous_scale=px.colors.cyclical.IceFire, size_max=20, zoom=10, hover_name="name")
# Slow down the frame transition for readability.
fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 3000
## un-comment this line if you want to show the map
#fig.show()
## comment these lines if you don't want to save the map
fig_name = 'performance_places_' + city_sort_name + '_' + month_string + '.html'
fig.write_html(html_path + '/' + fig_name)
### Trick: just for displaying the map in the final HTML - you won't need to do this.
IFrame(src="https://storage.googleapis.com/case_study_list/" + fig_name, width=1000, height=600)
# Per-day performance frequency at each location during the festival month.
city_festival_count = city_festival.groupby(['latitude', 'longitude', 'day']).count().reset_index()
city_festival_count = city_festival_count.rename(columns={"event_id": "frequency"})
city_festival_count = city_festival_count.round(8)
# One-time (lat, lon) -> (place_id, name) lookup; the original re-filtered
# city_festival for every row of the grouped frame (O(rows * len(df))).
# drop_duplicates keeps the first match, same as the original .head(1).iloc[0].
place_lookup = (city_festival
                .drop_duplicates(subset=['latitude', 'longitude'])
                .set_index(['latitude', 'longitude'])[['place_id', 'name']])
coords = list(zip(city_festival_count['latitude'], city_festival_count['longitude']))
city_festival_count['name'] = [place_lookup.at[c, 'name'] for c in coords]
city_festival_count['place_id'] = [place_lookup.at[c, 'place_id'] for c in coords]
# Fixed colour range across all animation frames.
max_frequency_place_fest = city_festival_count["frequency"].max()
city_festival_count = city_festival_count.sort_values(by=['day'], ascending=True)
fig = px.scatter_mapbox(city_festival_count, lat="latitude", lon="longitude", animation_frame="day",
                        #animation_group="category",
                        color="frequency",
                        range_color=(0, max_frequency_place_fest),
                        size="frequency",
                        mapbox_style="carto-positron",
                        zoom=10,
                        title="Performances Places Frequency in " + city + " During the Month of " + month_string + " (2018 to 2021)",
                        size_max=50, hover_name="name")
#fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 50
#fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 3000
## un-comment this line if you want to show the map
#fig.show()
## comment these lines if you don't want to save the map
fig_name = 'freq_performance_places_' + city_sort_name + '_' + month_string + '.html'
fig.write_html(html_path + '/' + fig_name)
### Trick: just for displaying the map in the final HTML - you won't need to do this.
IFrame(src="https://storage.googleapis.com/case_study_list/" + fig_name, width=1000, height=600)
# Performances during the festival month, restricted to the selected categories.
city_festival_cat = city_festival
city_festival_cat = city_festival_cat.loc[city_festival_cat['category'].isin(list_categories)]
# Pad every animation day with zero rows for the categories missing on that
# day, so Plotly keeps a consistent colour/legend mapping across frames.
# The original filler dict repeated the 'town', 'place_id' and 'name' keys;
# only the last occurrence of each survived, which is what is kept here.
filler_rows = []
for day in city_festival_cat.day.unique():
    present = set(city_festival_cat.loc[city_festival_cat['day'] == day, 'category'].unique())
    for cat_diff in present.symmetric_difference(set(list_categories)):
        filler_rows.append({'event_id': 0, 'latitude': 0.0, 'longitude': 0.0, 'day': day,
                            'category': cat_diff, 'frequency': 0, 'event_name': 0,
                            'start_ts': 0, 'end_ts': 0, 'ts': 0, 'town': 0, 'place_id': 0,
                            'name': 'None', 'number_of_perf': 0, 'month': 0})
if filler_rows:
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    city_festival_cat = pd.concat([city_festival_cat, pd.DataFrame(filler_rows)], ignore_index=True)
# Centre the map on the mean of the real (non-filler) coordinates.
city_festival_cat2 = city_festival_cat.loc[city_festival_cat.latitude != 0]
lat_mean = city_festival_cat2.latitude.mean()
long_mean = city_festival_cat2.longitude.mean()
fig = px.scatter_mapbox(city_festival_cat, lat="latitude", lon="longitude", animation_frame="day",
                        animation_group="name",
                        title="Performances Places per Category in " + city + " During the Month of " + month_string + " (2018 to 2021)",
                        color='category',
                        center=dict(lon=long_mean, lat=lat_mean),
                        color_continuous_scale=px.colors.cyclical.IceFire, size_max=20, zoom=10, hover_name="name")
fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 3000
## un-comment this line if you want to show the map
#fig.show()
## comment these lines if you don't want to save the map
fig_name = 'performance_places_category_' + city_sort_name + '_' + month_string + '.html'
fig.write_html(html_path + '/' + fig_name)
### Trick: just for displaying the map in the final HTML - you won't need to do this.
IFrame(src="https://storage.googleapis.com/case_study_list/" + fig_name, width=1000, height=600)
# Per-day frequency at each location, split by category.
city_festival_count_cat1 = city_festival.groupby(['latitude', 'longitude', 'day', 'category']).count().reset_index()
city_festival_count_cat = city_festival_count_cat1.loc[city_festival_count_cat1['category'].isin(list_categories)]
city_festival_count_cat = city_festival_count_cat.rename(columns={"event_id": "frequency"})
city_festival_count_cat = city_festival_count_cat.round(8)
# One-time (lat, lon) -> (place_id, name) lookup; replaces the O(rows * len(df))
# per-row filtering. drop_duplicates keeps the first match, same as the
# original .head(1).iloc[0].
place_lookup = (city_festival
                .drop_duplicates(subset=['latitude', 'longitude'])
                .set_index(['latitude', 'longitude'])[['place_id', 'name']])
coords = list(zip(city_festival_count_cat['latitude'], city_festival_count_cat['longitude']))
city_festival_count_cat['name'] = [place_lookup.at[c, 'name'] for c in coords]
city_festival_count_cat['place_id'] = [place_lookup.at[c, 'place_id'] for c in coords]
# Pad missing categories per day with zero rows so every Plotly frame shows
# the full category set (consistent colours/legend).
filler_rows = []
for day in city_festival_count_cat.day.unique():
    present = set(city_festival_count_cat.loc[city_festival_count_cat['day'] == day, 'category'].unique())
    for cat_diff in present.symmetric_difference(set(list_categories)):
        filler_rows.append({'latitude': 0.0, 'longitude': 0.0, 'day': day,
                            'category': cat_diff, 'frequency': 0, 'event_name': 0,
                            'start_ts': 0, 'end_ts': 0, 'ts': 0, 'town': 0, 'place_id': 0,
                            'name': 'None', 'number_of_perf': 0, 'month': 0})
if filler_rows:
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    city_festival_count_cat = pd.concat([city_festival_count_cat, pd.DataFrame(filler_rows)], ignore_index=True)
city_festival_count_cat = city_festival_count_cat.sort_values(by=['day'], ascending=True)
# Centre the map on the mean of the real (non-filler) coordinates.
city_festival_count_cat2 = city_festival_count_cat.loc[city_festival_count_cat.latitude != 0]
lat_mean = city_festival_count_cat2.latitude.mean()
long_mean = city_festival_count_cat2.longitude.mean()
fig = px.scatter_mapbox(city_festival_count_cat, lat="latitude", lon="longitude", animation_frame="day",
                        animation_group="category",
                        color="category",
                        mapbox_style="carto-positron",
                        size='frequency',
                        zoom=10,
                        size_max=50,
                        center=dict(lon=long_mean, lat=lat_mean),
                        title="Performances Places Frequency per Category in " + city + " During the Month of " + month_string + " (2018 to 2021)",
                        hover_name="name")
#fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 50
#fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 3000
## un-comment this line if you want to show the map
#fig.show()
## comment these lines if you don't want to save the map
fig_name = 'freq_performance_places_category_' + city_sort_name + '_' + month_string + '.html'
fig.write_html(html_path + '/' + fig_name)
### Trick: just for displaying the map in the final HTML - you won't need to do this.
IFrame(src="https://storage.googleapis.com/case_study_list/" + fig_name, width=1000, height=600)
# Load the tickets dataframe and restrict it to the selected city.
df_tickets = pd.read_pickle(dataframe_path + "/df_tickets")
# A single key name is enough; the original passed 'place_id' twice in a list.
df_t_place = df_tickets.merge(df_places, on='place_id')
df_t_town = df_t_place.dropna(subset=['town'])
# .copy() so the datetime conversion below does not raise the
# SettingWithCopyWarning seen in the original cell output.
df_t_city = df_t_town[df_t_town['town'].isin([city])].copy()
df_t_city['ts'] = pd.to_datetime(df_t_city['ts'], utc=True)
len(df_t_city)
/var/folders/4w/57rlv27n3g9b001td63pv1j00000gq/T/ipykernel_61430/3613812720.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
14023
len(df_t_city)
14023
### df_t_city_festival: tickets whose timestamp falls in the selected month, across years.
tz_info = df_t_city.iloc[0].ts.tzinfo
yearly_slices = []
# Single 2017-2021 loop replaces the duplicated 2017 case + 2018-2021 loop.
for year in range(2017, 2022):
    date1 = datetime.datetime(year, month, 1, tzinfo=tz_info)
    # relativedelta rolls December over to January; the original
    # datetime.datetime(year, month + 1, 1) raised ValueError when month == 12.
    date2 = date1 + relativedelta(months=1)
    yearly_slices.append(df_t_city[(df_t_city['ts'] >= date1) & (df_t_city['ts'] < date2)])
df_t_city_festival = pd.concat(yearly_slices, axis=0)
len(df_t_city_festival)
1428
# Ticket frequency per category over the whole period (helpers defined in an earlier cell).
df_t_category, fig_scatter, fig_bar=dataframe_groupby_size(df_t_city, ['category'], 'frequency', 'Tickets', city)
fig_bar.show()
# Same grouping, restricted to the selected festival month across years.
df_t_category_festival, fig_scatter, fig_bar=dataframe_groupby_size(df_t_city_festival, ['category'], 'frequency', 'Tickets', city, period="Month")
fig_bar.show()
# Histogram of tickets over time, coloured by category.
fig_his=fig_histogram_color(df_t_city, "ts", "category", "Tickets", city)
fig_his.show()
# Same histogram coloured by ticket type; rows without a type are dropped first.
df_t_type_city= df_t_city.dropna(subset=['type'])
fig_his=fig_histogram_color(df_t_type_city, "ts", "type", "Tickets", city)
fig_his.show()
# List the distinct ticket types present in the city.
print(df_t_type_city.type.unique())
['Standard' 'Concession' 'Family' 'additional children ' 'Children' 'Carers' 'Members' 'Students' 'early bird' 'ages 10--18' 'under 10s' 'Students & under 26s' 'Seniors' 'under 26s' 'under 18s' 'schoolchildren' 'Breakfast with Santa' "Santa's Grotto" 'Easter Bunny Trail & Crafts' 'Members and students' 'Concession and students' 'Concession & students' 'Concessions & students']
# Festival-month histograms: tickets by category, then by ticket type.
fig_his=fig_histogram_color(df_t_city_festival, "ts", "category", "Tickets", city, period="Month")
fig_his.show()
df_t_type_city_festival= df_t_city_festival.dropna(subset=['type'])
fig_his=fig_histogram_color(df_t_type_city_festival, "ts", "type", "Tickets", city, period="Month")
fig_his.show()
## only max_price==0.0
# Free tickets (zero revenue) over the whole period, by category.
df_f_price_city= df_t_city[df_t_city["t_revenue"]== 0.0]
fig_his=fig_histogram_color(df_f_price_city, "ts", "category", "Tickets", city)
fig_his.show()
## only max_price==0.0
# Free tickets restricted to the festival month across years.
df_f_price_city_festival= df_t_city_festival[df_t_city_festival["t_revenue"]== 0.0]
fig_his=fig_histogram_color(df_f_price_city_festival, "ts", "category", "Tickets", city, period="Month")
fig_his.show()
# Per-category time series of ticket counts (one figure per category).
df_t_city_category= dataframe_groupby_size(df_t_city, ['category', 'ts'] , "frequency")
for cat in list_categories:
    fig=fig_histogram_subtype(df_t_city_category, "ts", "category", "frequency", cat, "Tickets", city)
    fig.show()
# Same per-category series for the festival month only.
df_t_city_category_festival= dataframe_groupby_size(df_t_city_festival, ['category', 'ts'] , "frequency")
for cat in list_categories:
    fig=fig_histogram_subtype(df_t_city_category_festival, "ts", "category", "frequency", cat, "Tickets", city, period="Month")
    fig.show()
IMPORTANT: This section is very compute-intensive. It could take a long time to run if you have selected a city with a high number of events (e.g. Edinburgh).
We are going to calculate the topic modelling of the descriptions events placed for the selected city.
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.parsing.preprocessing import remove_stopwords
import re
from hdbscan import HDBSCAN
# Event descriptions for the selected city; rows without a description are dropped.
df_desc_town = pd.read_pickle(dataframe_path + "/df_desc_town")
df_desc_city = df_desc_town[df_desc_town['town'].isin([city])]
df_desc_city = df_desc_city.dropna(subset=['event_description']).reset_index()
documents = df_desc_city["event_description"].values
# Clean every description (clean_documents is defined in an earlier cell);
# a comprehension replaces the original manual append loop.
d = [clean_documents(text) for text in documents]
len(d)
1933
#un-comment these lines if you want to generate and save the text-embeddings again
## Using the all-MiniLM-L6-v2 sentence transformer
## (the original comment mentioned all-mpnet-base-v2, but the code loads all-MiniLM-L6-v2)
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode the cleaned event descriptions into dense text embeddings.
text_embeddings = model.encode(d, batch_size = 8, show_progress_bar = True)
city_embeddings=np.array(text_embeddings)
# Cache the embeddings on disk so later runs can skip the (slow) encoding step.
embeddings_name=models_path+"/"+city_sort_name+"_embeddings.npy"
np.save(embeddings_name, city_embeddings)
Batches: 0%| | 0/242 [00:00<?, ?it/s]
# Comment these lines if you don't want to read the text embeddings from file.
# Loads the cache written by the generation cell above.
embeddings_name=models_path+"/"+city_sort_name+"_embeddings.npy"
text_embeddings = np.load(embeddings_name)
We have tested two possible configurations for training the topic model:
import os
# Silence the HuggingFace tokenizers fork warning when clustering spawns workers.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# un-comment these lines if you want to train the topic model
### We have two options for generating the topic model
##### Option 1: Without HDBSCAN (BERTopic's default clustering)
#topic_model = BERTopic(verbose=True, nr_topics="auto", top_n_words=8).fit(d, text_embeddings)
#bertopic_name=models_path+"/"+city_sort_name+"_BertTopic_Model"
#topic_model.save(bertopic_name)
#### Option 2: Using a custom HDBSCAN clustering model
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean',
                        cluster_selection_method='eom', prediction_data=True, min_samples=5)
topic_model = BERTopic(hdbscan_model=hdbscan_model, nr_topics="auto").fit(d, text_embeddings)
bertopic_name=models_path+"/"+city_sort_name+"_BertTopic_Model_HDBSCAN"
topic_model.save(bertopic_name)
# Comment these lines if you don't want to read the topic model from file.
### Attention - you have two options again, with and without HDBSCAN
##### Option 1: Without HDBSCAN
# NOTE(review): this reloads the *non*-HDBSCAN model, discarding the HDBSCAN
# model trained and saved just above — confirm which of the two is intended.
bertopic_name=models_path+"/"+city_sort_name+"_BertTopic_Model"
topic_model = BERTopic.load(bertopic_name)
#### Option 2: Using HDBSCAN
#bertopic_name=models_path+"/"+city_sort_name+"_BertTopic_Model_HDBSCAN"
#topic_model = BERTopic.load(bertopic_name)
len(d)
1933
# Frequency of the first 10 topics (topic -1 is BERTopic's outlier bucket).
topic_model.get_topic_freq()[0:10]
| Topic | Count | |
|---|---|---|
| 0 | -1 | 617 |
| 1 | 0 | 184 |
| 2 | 1 | 177 |
| 3 | 2 | 153 |
| 4 | 3 | 70 |
| 5 | 4 | 64 |
| 6 | 5 | 58 |
| 7 | 6 | 56 |
| 8 | 7 | 55 |
| 9 | 8 | 51 |
# NOTE(review): fit_transform re-fits the model loaded above rather than only
# assigning topics; if the saved model should be reused as-is, .transform
# would avoid re-training — confirm intent.
topics, probs = topic_model.fit_transform(d, text_embeddings)
# Built-in BERTopic visualisations: top words per topic, topic-similarity
# heatmap, and the 2-D inter-topic distance map.
topic_model.visualize_barchart()
topic_model.visualize_heatmap()
topic_model.visualize_topics()
#topic_model.get_topics()
# Top words (with c-TF-IDF weights) for topic 0.
topic_model.get_topic(0)
[('book', 0.015846167600741864),
('new', 0.015647298321825265),
('novel', 0.014302809352303865),
('history', 0.013182917634057004),
('author', 0.012160257457676502),
('life', 0.011626435246681098),
('but', 0.010364271935179905),
('st', 0.01023771938493879),
('latest', 0.00974591275193944),
('this', 0.009700655330193568)]
# Attach topic assignments and categories to the per-city description table.
df_topic_city = df_desc_city
# One topic id per document, in the same order as d / df_desc_city rows.
df_topic_city['topic_lda'] = topics
# event_id -> category lookup built once, instead of filtering df_events for
# every row; drop_duplicates keeps the first match like the original .values[0].
event_category = df_events.drop_duplicates(subset=['event_id']).set_index('event_id')['category']
df_topic_city['category'] = df_topic_city['event_id'].map(event_category)
# Human-readable topic labels: "<topic>_word1_..._word5" from the topic's top words.
lda_topic_words = []
topic_lda_names = []
for topic_num in df_topic_city['topic_lda']:
    words = [word for word, _weight in topic_model.get_topic(topic_num)]
    lda_topic_words.append(words)
    topic_lda_names.append(str(topic_num) + "_" + '_'.join(words[0:5]))
df_topic_city['lda_topics'] = lda_topic_words
df_topic_city['topic_lda_name'] = topic_lda_names
# Integer id per category, in order of first appearance ('idx' avoids
# shadowing the builtin `id` that the original loop used).
category_list = df_topic_city["category"].unique()
category_id = {cat: idx for idx, cat in enumerate(category_list)}
category_id
df_topic_city['category_id'] = df_topic_city['category'].map(category_id)
# Events per category (size per (event, category), then summed per category).
df_topic_city_category = df_topic_city.groupby(['event_id', 'category']).size().reset_index()
df_topic_city_category.rename(columns={0: "number_of_times"}, inplace=True)
df_topic_city_category_1 = df_topic_city_category.groupby(['category'])['number_of_times'].sum().reset_index()
df_topic_city_category_1 = df_topic_city_category_1.rename(columns={'number_of_times': "sum_num_of_times"}).sort_values(by=['sum_num_of_times'], ascending=False).reset_index()
# Topic x category co-occurrence counts, shown from both orientations.
df_topic_city_category_lda = df_topic_city.groupby(['topic_lda', 'category']).size().reset_index()
df_topic_city_category_lda.rename(columns={0: "number_of_times"}, inplace=True)
px.scatter(df_topic_city_category_lda, x="category", y='topic_lda', color='topic_lda', size="number_of_times", size_max=30, title="Frequency of events of per category")
px.scatter(df_topic_city_category_lda, x="topic_lda", y='category', color='category', size="number_of_times", size_max=30, title="Frequency of events of per category")